#1. Load the librraies

library(ggplot2)
library(methods)
library(edgeR)
## Warning: package 'edgeR' was built under R version 4.3.1
## Loading required package: limma
## Warning: package 'limma' was built under R version 4.3.1
library(Seurat)
## The legacy packages maptools, rgdal, and rgeos, underpinning the sp package,
## which was just loaded, were retired in October 2023.
## Please refer to R-spatial evolution reports for details, especially
## https://r-spatial.org/r/2023/05/15/evolution4.html.
## It may be desirable to make the sf package available;
## package maintainers should consider adding sf to Suggests:.
## Attaching SeuratObject
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(rafalib)
library(devtools)
## Loading required package: usethis
## 
## Attaching package: 'devtools'
## The following object is masked from 'package:rafalib':
## 
##     install_bioc

#2. Import data # Change the directories for your data

GSE146771_CRC_Leukocyte_10x_Metadata<- read.csv("/Users/rxk519/Desktop/Projects/CRC_Project/CRC_datasets_GSE146771/GSE146771_CRC.Leukocyte.10x.Metadata.txt", sep = "\t")
vec.cell.type <- GSE146771_CRC_Leukocyte_10x_Metadata$CellName
names(vec.cell.type) <- GSE146771_CRC_Leukocyte_10x_Metadata$Global_Cluster

#counts
#GSE146771_CRC_Leukocyte_10x_TPM<- read.table("/Users/rxk519/Desktop/Projects/CRC_Project/CRC_datasets_GSE146771/GSE146771_CRC.Leukocyte.10x.TPM.txt", fill = TRUE, header = TRUE, sep = "")
#saveRDS(GSE146771_CRC_Leukocyte_10x_TPM, file = "/Users/rxk519/Desktop/Projects/CRC_Project/CRC_datasets_GSE146771/data/GSE146771_CRC_Leukocyte_10x_TPM.rds")


GSE146771_CRC_Leukocyte_10x_TPM<-readRDS("/Users/rxk519/Desktop/Projects/CRC_Project/CRC_datasets_GSE146771/data/GSE146771_CRC_Leukocyte_10x_TPM.rds")

#3.create a seurat project

#create a seurat object
dim(GSE146771_CRC_Leukocyte_10x_TPM) #dim of data
## [1] 13538 43817
pbmc <- CreateSeuratObject(counts = GSE146771_CRC_Leukocyte_10x_TPM, project = "CRC", min.cells = 3, min.features = 200)

pbmc
## An object of class Seurat 
## 13538 features across 43817 samples within 1 assay 
## Active assay: RNA (13538 features, 0 variable features)

#4.add variables to the seurat object #add the variable to the metadata(Sub_ClusterID, Global_Cluster, CellName) #CellName

cell.type <- GSE146771_CRC_Leukocyte_10x_Metadata$Global_Cluster
names(cell.type) <- GSE146771_CRC_Leukocyte_10x_Metadata$CellName
pbmc <- AddMetaData(object = pbmc, metadata = cell.type, col.name = "cell_type")
#Sub_ClusterID
tumor.name <- GSE146771_CRC_Leukocyte_10x_Metadata$Sub_ClusterID
names(tumor.name) <- GSE146771_CRC_Leukocyte_10x_Metadata$CellName
pbmc <- AddMetaData(object = pbmc, metadata = tumor.name, col.name = "tumor_name")

#5. Preprocessing Steps #5.1 calculate mitocondria percentage

pbmc[["percent.mt"]] <- PercentageFeatureSet(pbmc, pattern = "^MT-")

#5.2FeatureScatter plot #FeatureScatter is typically used to visualize gene-gene relationships, but can be used for anything calculated by the object

# for anything calculated by the object, i.e. columns in object metadata, PC scores etc.
par(mfrow = c(1, 2))
FeatureScatter(pbmc, feature1 = "nCount_RNA", feature2 = "percent.mt")

FeatureScatter(pbmc, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")

#5.3filter out cells that have unique gene counts (nFeature_RNA) over 2,500 or less than

# 200 Note that > and < are used to define a'gate'.  
#-Inf and Inf should be used if you don't want a lower or upper threshold.
pbmc <- subset(pbmc, subset = nFeature_RNA > 200 & nFeature_RNA < 2500 & percent.mt < 5)
pbmc <- SetIdent(pbmc, value = pbmc@meta.data$cell_type)

#5.4 Normalization step

pbmc <- NormalizeData(pbmc, normalization.method = "LogNormalize", scale.factor = 10000)

#5.5 Detection of variable genes across the single cells

pbmc <- FindVariableFeatures(pbmc, selection.method = "vst", nfeatures = 2000)

#5.6 Identify the 10 most highly variable genes

top10 <- head(VariableFeatures(pbmc), 10)

#5.7 plot variable features with and without labels

par(mfrow = c(1, 2))
plot1 <- VariableFeaturePlot(pbmc)
plot2 <- LabelPoints(plot = plot1, points = top10, repel = TRUE)
## When using repel, set xnudge and ynudge to 0 for optimal results
plot1 + plot2

#6. scale Data

all.genes <- rownames(pbmc)
pbmc <- ScaleData(pbmc, features = all.genes)
## Centering and scaling data matrix

#6.2 Perform linear dimensional reduction (PCA)

#perform PCA on the scaled data
pbmc <- RunPCA(pbmc, features = VariableFeatures(object = pbmc))
## PC_ 1 
## Positive:  IL32, CD3D, CD2, RPL3, CD3E, CD3G, LCK, MALAT1, CD69, CCL5 
##     CD7, GZMA, IL2RG, EVL, RPSA, KLRB1, CST7, STK17A, ITM2A, CLEC2D 
##     IL7R, CXCR4, RORA, GZMM, LDHB, ISG20, CD247, NKG7, JUN, LTB 
## Negative:  CST3, LYZ, AIF1, SERPINA1, LST1, FCN1, CSTA, FCER1G, TYROBP, MNDA 
##     CD14, S100A9, S100A8, VCAN, CFD, FPR1, RETN, IFITM3, MS4A6A, CYBB 
##     PSAP, CLEC12A, SPI1, S100A12, MS4A7, GRN, CD68, CTSS, TYMP, FGL2 
## PC_ 2 
## Positive:  CD79A, DERL3, TNFRSF17, MZB1, JCHAIN, TXNDC5, PRDX4, HERPUD1, FOSB, EAF2 
##     SPAG4, UBE2J1, SEC11C, SDC1, RRBP1, BTG2, FKBP2, SSR3, LMAN1, JUN 
##     TNFRSF13B, FKBP11, MEF2C, ITM2C, HSP90B1, NR4A1, SPCS3, XBP1, CD38, FAM46C 
## Negative:  TMSB4X, S100A4, GIMAP7, NKG7, HCST, IL32, GZMH, SH3BGRL3, ACTB, CST7 
##     GZMA, CCL5, GIMAP4, PTPRC, PFN1, ANXA1, CD2, FYB, FGFBP2, KLRD1 
##     CD3D, IFITM2, CORO1A, GZMM, S100A6, TMSB10, PRF1, CD52, CD3G, MYL12A 
## PC_ 3 
## Positive:  TNFAIP3, CREM, CXCL2, NR4A2, RGS1, CXCL8, OLR1, C15ORF48, DUSP2, SAMSN1 
##     CCL20, PHLDA1, CXCL3, ZNF331, IL1B, ZFP36, G0S2, AREG, CCL3L3, SPP1 
##     DNAJB1, APOC1, RNASE1, VEGFA, DNAJA1, IER3, EREG, APOE, FOSB, HBEGF 
## Negative:  CD79A, MEF2C, CD79B, TNFRSF17, MS4A1, MZB1, DERL3, BANK1, VPREB3, TXNIP 
##     EAF2, TNFRSF13B, MALAT1, CD74, RALGPS2, TXNDC5, RPL3, PLAC8, KIAA0125, LINC00926 
##     PRDX4, SPIB, SEC11C, FCRLA, UBE2J1, RPS5, TCL1A, FTL, TNFRSF13C, TMSB10 
## PC_ 4 
## Positive:  FKBP11, DERL3, TNFRSF17, CD63, NKG7, RRBP1, SDF2L1, GZMH, PRDX4, TXNDC5 
##     XBP1, GZMA, MZB1, LGALS1, VIMP, GZMB, KLRD1, MANF, FKBP2, CTSW 
##     CCL5, SEC11C, CCL4, FGFBP2, CST7, SPAG4, SDC1, ITM2C, GSTP1, MYDGF 
## Negative:  MS4A1, LTB, BANK1, VPREB3, CD22, CXCR4, TCL1A, CCR7, SPIB, HLA-DQB1 
##     LINC00926, CD24, CD37, GPR183, CD83, HLA-DMB, TNFRSF13C, SWAP70, HLA-DQA1, LAPTM5 
##     FCER2, CD72, CD52, BTG1, CD79B, SMIM14, HLA-DRA, TLR10, PABPC1, REL 
## PC_ 5 
## Positive:  FOS, IL7R, S100A12, DUSP1, FYB, ICOS, VIM, JAML, FCN1, SARAF 
##     ITM2B, VCAN, CLEC12A, CD40LG, SRGN, TNFRSF4, CYP1B1, PBXIP1, CITED2, TBC1D4 
##     TNFRSF18, JUNB, CD28, RETN, RGCC, TRAT1, TNFRSF25, CFP, SPOCK2, CD27 
## Negative:  HLA-DPB1, HLA-DQA1, HLA-DPA1, HLA-DQB1, MS4A1, HLA-DQA2, GZMH, FGFBP2, HLA-DRB1, CCL4 
##     C15ORF48, HLA-DMB, OLR1, APOC1, NKG7, BANK1, APOE, SPP1, GPNMB, C1QC 
##     CXCL2, HLA-DRA, CD83, KLRD1, KLRF1, RNASE1, CXCL3, GZMB, GNLY, VPREB3
print(pbmc[["pca"]], dims = 1:5, nfeatures = 5)
## PC_ 1 
## Positive:  IL32, CD3D, CD2, RPL3, CD3E 
## Negative:  CST3, LYZ, AIF1, SERPINA1, LST1 
## PC_ 2 
## Positive:  CD79A, DERL3, TNFRSF17, MZB1, JCHAIN 
## Negative:  TMSB4X, S100A4, GIMAP7, NKG7, HCST 
## PC_ 3 
## Positive:  TNFAIP3, CREM, CXCL2, NR4A2, RGS1 
## Negative:  CD79A, MEF2C, CD79B, TNFRSF17, MS4A1 
## PC_ 4 
## Positive:  FKBP11, DERL3, TNFRSF17, CD63, NKG7 
## Negative:  MS4A1, LTB, BANK1, VPREB3, CD22 
## PC_ 5 
## Positive:  FOS, IL7R, S100A12, DUSP1, FYB 
## Negative:  HLA-DPB1, HLA-DQA1, HLA-DPA1, HLA-DQB1, MS4A1
VizDimLoadings(pbmc, dims = 1:2, reduction = "pca")

#6.3 DimHeatmap for Genes by PCs

#PC1 plot 
DimHeatmap(pbmc, dims = 1, cells = 500, balanced = TRUE)

#6.4 DimHeatmap for Genes by PCs (15) #PC1-PC15 plots

DimHeatmap(pbmc, dims = 1:15, cells = 500, balanced = TRUE)

#DimPlot(pbmc, reduction = "pca")

#compiuting the nearest neighbor graph

pbmc <- FindNeighbors(pbmc, dims = 1:10)
## Computing nearest neighbor graph
## Computing SNN

#computing the clusters

pbmc <- FindClusters(pbmc, resolution = 0.5)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 43049
## Number of edges: 1369226
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9331
## Number of communities: 17
## Elapsed time: 11 seconds

#RunMap Non-linear dimension reduction(UMAP/Tsne)

#head(Idents(pbmc), 5)
#-----------------------------------
pbmc <- RunUMAP(pbmc, dims = 1:10)
## Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
## To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
## This message will be shown once per session
## 13:31:16 UMAP embedding parameters a = 0.9922 b = 1.112
## 13:31:16 Read 43049 rows and found 10 numeric columns
## 13:31:16 Using Annoy for neighbor search, n_neighbors = 30
## 13:31:16 Building Annoy index with metric = cosine, n_trees = 50
## 0%   10   20   30   40   50   60   70   80   90   100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 13:31:19 Writing NN index file to temp file /var/folders/vm/569pw0m92jl40jfnhyq4hrwnfhms6v/T//RtmpW5RO2z/file69de2fc3a3df
## 13:31:19 Searching Annoy index using 1 thread, search_k = 3000
## 13:31:30 Annoy recall = 100%
## 13:31:30 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
## 13:31:32 Initializing from normalized Laplacian + noise (using irlba)
## 13:31:35 Commencing optimization for 200 epochs, with 1783242 positive edges
## 13:31:55 Optimization finished

#Dimplot without selecting Ident( variable)

DimPlot(pbmc, reduction = "umap")

#Dimplot selecting Ident as cell_type

#using the cell_type to create a pca plot 

pbmc <- SetIdent(pbmc, value = pbmc@meta.data$cell_type)
DimPlot(pbmc, label = T , repel = T, label.size = 3) + NoLegend()

#Finding differentially expressed features (cluster biomarkers)

#Find differentially expressed features between Myeloid cell and all other cells, only
# search for positive markers


Myeloid.markers <- FindMarkers(object = pbmc, ident.1 = "Myeloid cell", min.pct = 0.25, test.use = "roc", only.pos = TRUE)

head(Myeloid.markers)
##        myAUC avg_diff power avg_log2FC pct.1 pct.2
## CST3   0.986 2.316599 0.972   3.342145 0.981 0.099
## TYROBP 0.973 1.926025 0.946   2.778667 0.993 0.161
## FCER1G 0.966 2.003203 0.932   2.890011 0.979 0.118
## LYZ    0.961 2.442739 0.922   3.524128 0.938 0.092
## AIF1   0.961 2.247990 0.922   3.243163 0.938 0.077
## LST1   0.952 2.144867 0.904   3.094389 0.929 0.091
#differential expression between two specific groups of cells, specify the ident.1 and ident.2 parameters
#DE genes between to CD4 cell & CD8 T cell

CD.markers <- FindMarkers(object = pbmc, ident.1 = "CD4 T cell", ident.2 = "CD8 T cell", min.pct = 0.25)
FeaturePlot(object = pbmc, 
            features = c("TREM1", "TREM2", "LILRB4A", "SIGLEC15", "REL", "PDL1", "PDCD1LG2","CD274", "SIGLEC10", "MERTK", "SIRPA", "PDCD1","NT5E","ENTPD1","CD276", "CD83", "TNFR2" ))
## Warning in FetchData.Seurat(object = object, vars = c(dims, "ident", features),
## : The following requested variables were not found: LILRB4A, PDL1, TNFR2

pbmc.markers <- FindAllMarkers(object = pbmc, only.pos = TRUE, min.pct = 0.25, thresh.use = 0.25)
## Calculating cluster CD4 T cell
## Calculating cluster ILC
## Calculating cluster CD8 T cell
## Calculating cluster B cell
## Calculating cluster Myeloid cell
VlnPlot(object = pbmc, features = c("TREM1", "TREM2"))

# view results

#Cell marker genes

# find all markers of cluster 1 using default parameters
markers_genes <- FindAllMarkers(pbmc, log2FC.threshold = 0.2, test.use = "wilcox",
    min.pct = 0.1, min.diff.pct = 0.2, only.pos = TRUE, max.cells.per.ident = 50,
    assay = "RNA")
## Calculating cluster CD4 T cell
## Calculating cluster ILC
## Calculating cluster CD8 T cell
## Calculating cluster B cell
## Calculating cluster Myeloid cell

#select the top 25 up regulated genes for plotting

markers_genes %>%
    group_by(cluster) %>%
    top_n(-25, p_val_adj) -> top25
top25
## # A tibble: 145 × 7
## # Groups:   cluster [5]
##       p_val avg_log2FC pct.1 pct.2  p_val_adj cluster    gene  
##       <dbl>      <dbl> <dbl> <dbl>      <dbl> <fct>      <chr> 
##  1 7.41e-11      1.43  0.929 0.302 0.00000100 CD4 T cell CD3D  
##  2 8.32e-10      1.28  0.721 0.222 0.0000113  CD4 T cell CD3E  
##  3 2.16e- 9      1.23  0.955 0.392 0.0000292  CD4 T cell IL32  
##  4 2.51e- 9      1.31  0.703 0.209 0.0000340  CD4 T cell CD3G  
##  5 5.94e- 9      1.42  0.819 0.258 0.0000805  CD4 T cell CD2   
##  6 2.30e- 8      1.52  0.843 0.321 0.000312   CD4 T cell LTB   
##  7 2.31e- 7      1.34  0.365 0.089 0.00313    CD4 T cell PBXIP1
##  8 4.40e- 7      2.13  0.679 0.106 0.00596    CD4 T cell IL7R  
##  9 1.80e- 6      1.10  0.797 0.432 0.0243     CD4 T cell LDHB  
## 10 1.89e- 6      0.775 0.59  0.324 0.0255     CD4 T cell CD69  
## # ℹ 135 more rows

#We can now select the top 25 up regulated genes for plotting

mypar(2, 5, mar = c(4, 6, 3, 1))
for (i in unique(top25$cluster)) {
    barplot(sort(setNames(top25$avg_log2FC, top25$gene)[top25$cluster == i], F),
        horiz = T, las = 1, main = paste0(i, " vs. rest"), border = "white", yaxs = "i")
    abline(v = c(0, 0.25), lty = c(1, 2))
}

markers_genes %>%
    group_by(cluster) %>%
    top_n(-5, p_val_adj) -> top5

# create a scale.data slot for the selected genes
pbmc <- ScaleData(pbmc, features = as.character(unique(top5$gene)), assay = "RNA")
## Centering and scaling data matrix
DoHeatmap(pbmc, features = as.character(unique(top5$gene)), group.by = "cell_type",
    assay = "RNA")

DotPlot(pbmc, features = rev(as.character(unique(top5$gene))), group.by = "cell_type",
    assay = "RNA") + coord_flip()

#find markers for every cluster compared to all remaining cells, report

All.markers <- FindAllMarkers(pbmc, only.pos = T, min.pct = 0.5, logfc.threshold = 0.5)
## Calculating cluster CD4 T cell
## Calculating cluster ILC
## Calculating cluster CD8 T cell
## Calculating cluster B cell
## Calculating cluster Myeloid cell
dim(All.markers)
## [1] 313   7
table(All.markers$cluster)
## 
##   CD4 T cell          ILC   CD8 T cell       B cell Myeloid cell 
##           31           41           36           66          139
All.markers %>% group_by(cluster) %>% top_n(2, avg_log2FC)
## # A tibble: 10 × 7
## # Groups:   cluster [5]
##    p_val avg_log2FC pct.1 pct.2 p_val_adj cluster      gene 
##    <dbl>      <dbl> <dbl> <dbl>     <dbl> <fct>        <chr>
##  1     0       2.13 0.679 0.106         0 CD4 T cell   IL7R 
##  2     0       1.52 0.843 0.321         0 CD4 T cell   LTB  
##  3     0       3.05 0.888 0.13          0 ILC          GNLY 
##  4     0       2.87 0.661 0.032         0 ILC          KLRF1
##  5     0       2.29 0.887 0.191         0 CD8 T cell   GZMA 
##  6     0       2.23 0.63  0.075         0 CD8 T cell   KLRD1
##  7     0       3.47 0.933 0.036         0 B cell       CD79A
##  8     0       2.87 0.644 0.087         0 B cell       MZB1 
##  9     0       3.52 0.938 0.092         0 Myeloid cell LYZ  
## 10     0       3.43 0.815 0.028         0 Myeloid cell FCN1

#Identify the markers in the clusters

FeaturePlot(pbmc, features = c("TREM1", "TREM2", "LILRB4A", "SIGLEC15", "REL", "PDL1", "PDCD1LG2","CD274", "SIGLEC10", "MERTK", "SIRPA", "PDCD1","NT5E","ENTPD1","CD276", "CD83", "TNFR2" ))
## Warning in FetchData.Seurat(object = object, vars = c(dims, "ident", features),
## : The following requested variables were not found: LILRB4A, PDL1, TNFR2

R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this: